﻿using System.Linq;
using System.Numerics;

namespace Masuit.Tools.Strings;

/// <summary>
/// Computes a SimHash fingerprint for a piece of text and measures how far apart two
/// fingerprints are. The text is split into tokens by <see cref="SimTokenizer"/>; each token
/// is hashed and every bit position of the fingerprint is decided by a vote across all tokens.
/// </summary>
public class SimHash
{
    /// <summary>Fingerprint width, in bits, used when the caller does not supply one.</summary>
    private const int DefaultHashBits = 128;

    /// <summary>Multiplier applied at every step of the per-token rolling hash.</summary>
    private static readonly BigInteger HashMultiplier = new BigInteger(1000003);

    private readonly string _tokens;
    private readonly BigInteger _strSimHash;
    private readonly int _hashBits;

    /// <summary>Value with the lowest <c>_hashBits</c> bits set; computed once per instance.</summary>
    private readonly BigInteger _hashMask;

    /// <summary>The fingerprint computed for the text passed to the constructor.</summary>
    public BigInteger StrSimHash => _strSimHash;

    /// <summary>
    /// Builds the fingerprint of <paramref name="tokens"/> using <paramref name="hashBits"/> bits.
    /// </summary>
    /// <param name="tokens">Text to fingerprint.</param>
    /// <param name="hashBits">Number of bits in the fingerprint; must not be negative.</param>
    /// <exception cref="System.ArgumentOutOfRangeException"><paramref name="hashBits"/> is negative.</exception>
    public SimHash(string tokens, int hashBits)
    {
        if (hashBits < 0)
        {
            // Fail with a descriptive argument error instead of the OverflowException
            // that allocating a negative-length array would otherwise raise.
            throw new System.ArgumentOutOfRangeException(nameof(hashBits), hashBits, "The number of hash bits must not be negative.");
        }

        _tokens = tokens;
        _hashBits = hashBits;
        _hashMask = (BigInteger.One << hashBits) - BigInteger.One;
        _strSimHash = GetSimHash();
    }

    /// <summary>
    /// Builds the fingerprint of <paramref name="tokens"/> using the default width of 128 bits.
    /// </summary>
    /// <param name="tokens">Text to fingerprint.</param>
    public SimHash(string tokens) : this(tokens, DefaultHashBits)
    {
    }

    /// <summary>
    /// Hashes every token and lets each hash vote +1 (bit set) or -1 (bit clear) per bit position.
    /// A fingerprint bit is set when its vote total is zero or positive, so a text that yields
    /// no tokens produces a fingerprint with all bits set.
    /// </summary>
    private BigInteger GetSimHash()
    {
        // The single-bit masks do not depend on the token, so build them once up front.
        var bitMasks = new BigInteger[_hashBits];
        for (var i = 0; i < _hashBits; i++)
        {
            bitMasks[i] = BigInteger.One << i;
        }

        var votes = new int[_hashBits];
        var tokenizer = new SimTokenizer(_tokens);
        while (tokenizer.HasMoreTokens())
        {
            var tokenHash = Hash(tokenizer.NextToken());
            for (var i = 0; i < _hashBits; i++)
            {
                if ((tokenHash & bitMasks[i]).IsZero)
                {
                    votes[i] -= 1;
                }
                else
                {
                    votes[i] += 1;
                }
            }
        }

        var fingerprint = BigInteger.Zero;
        for (var i = 0; i < _hashBits; i++)
        {
            if (votes[i] >= 0)
            {
                // Each mask holds a distinct bit, so OR-ing is the same as adding.
                fingerprint |= bitMasks[i];
            }
        }

        return fingerprint;
    }

    /// <summary>
    /// Hashes a single token: starts from the first character shifted left by 7, folds every
    /// character in with multiply-then-XOR truncated to the fingerprint width, and finally
    /// XORs in the token length.
    /// </summary>
    /// <param name="source">Token to hash.</param>
    /// <returns>The token hash, or zero for a null or empty token.</returns>
    private BigInteger Hash(string source)
    {
        if (string.IsNullOrEmpty(source))
        {
            return BigInteger.Zero;
        }

        var x = new BigInteger((long)source[0] << 7);
        foreach (var item in source)
        {
            x = ((x * HashMultiplier) ^ new BigInteger((long)item)) & _hashMask;
        }

        // x is non-negative after masking and the length is positive, so the result can never
        // be -1; no remapping of that value is needed.
        x ^= new BigInteger(source.Length);
        return x;
    }

    /// <summary>
    /// Counts the bit positions in which this fingerprint differs from <paramref name="other"/>.
    /// Only the lowest <c>hashBits</c> bits of this instance take part in the comparison.
    /// </summary>
    /// <param name="other">Fingerprint to compare against.</param>
    /// <returns>The number of differing bits.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="other"/> is null.</exception>
    public int HammingDistance(SimHash other)
    {
        if (other is null)
        {
            throw new System.ArgumentNullException(nameof(other));
        }

        var x = (_strSimHash ^ other._strSimHash) & _hashMask;
        var tot = 0;
        while (x.Sign != 0)
        {
            tot += 1;

            // Clears the lowest set bit, so the loop runs once per differing bit.
            x &= x - BigInteger.One;
        }

        return tot;
    }
}

/// <summary>
/// A deliberately simple tokenizer: it hands back the source text one character at a time,
/// which for Chinese text means splitting it into individual characters. It can be replaced
/// by a different tokenization strategy.
/// </summary>
public class SimTokenizer
{
    private readonly string _source;
    private int _index;
    private readonly int _length;

    /// <summary>
    /// Creates a tokenizer positioned at the start of <paramref name="source"/>.
    /// </summary>
    /// <param name="source">Text to tokenize; null is treated as an empty string.</param>
    public SimTokenizer(string source)
    {
        // Normalise null once here so NextToken never dereferences a null string.
        _source = source ?? string.Empty;
        _index = 0;
        _length = _source.Length;
    }

    /// <summary>
    /// Reports whether at least one more token can be read.
    /// </summary>
    /// <returns>True while the read position is before the end of the text.</returns>
    public bool HasMoreTokens()
    {
        return _index < _length;
    }

    /// <summary>
    /// Returns the next one-character token and advances the read position.
    /// </summary>
    /// <returns>A string of length one taken from the current position.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// No tokens remain; check <see cref="HasMoreTokens"/> first.
    /// </exception>
    public string NextToken()
    {
        var s = _source.Substring(_index, 1);
        _index++;
        return s;
    }
}
